# Selenium browser imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
# All other imports
import pandas as pd
import numpy as np
import requests
import lxml.html as lx
from lxml import etree
from bs4 import BeautifulSoup
import re
import time
import csv
import numpy as np
import pandas as pd
# Get the text file of FIPS data for states and counties
response = requests.get('https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt',
                        timeout=30)  # avoid hanging forever on a dead server
fips_data = response.text
# Parse the fixed-width text: data rows are indented by at least three
# spaces; columns 0-10 hold the FIPS code, the remainder holds the name.
lines = fips_data.split('\n')
data_lines = [line for line in lines if line.startswith(' ' * 3)]
fip_code = [line[:11].strip() for line in data_lines]
names = [line[11:].strip() for line in data_lines]
# Remove the " County" suffix from each name
new_names = [name.replace(' County', '') for name in names]
# Rows 54 onward are the counties; rows 3-53 are the states
county_name = new_names[54:]
county_code = fip_code[54:]
state_codes = fip_code[3:54]
state_names = new_names[3:54]
# FIPS data table for states
fips_data_states_df = pd.DataFrame({'Code': state_codes, 'Name': state_names})
# 1-based index, used later to pick the matching <option> in the dropdown
fips_data_states_df['Index'] = np.arange(1, len(fips_data_states_df) + 1)
# State abbreviations, in the same row order as the table above
state_abbrev = 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
fips_data_states_df['State Abbrev'] = state_abbrev
# FIPS data table for counties
fips_data_county_df = pd.DataFrame({'Code': county_code, 'Name': county_name})
# The first two digits of a county FIPS code identify its state
fips_data_county_df['State Code'] = fips_data_county_df['Code'].str[:2]
# Upper-case county names so they match the scraped site's formatting
fips_data_county_df['Name'] = fips_data_county_df['Name'].str.upper()
def scrape_train_accidents(fips_data_states_df):
    """Scrape per-county train accident totals from the FRA safety site.

    For every state in ``fips_data_states_df`` (requires 'Index' and 'Code'
    columns), selects the state in the site's dropdown, generates the
    report, and parses the resulting HTML table.

    Returns a list of three parallel lists:
    ``[county_names, accident_totals, state_fips_code_per_county]``.
    States whose dropdown entry is missing or whose report page times out
    are skipped with a printed notice.
    """
    url = "https://safetydata.fra.dot.gov/OfficeofSafety/publicsite/query/inctmap.aspx"
    chrome_options = Options()
    chrome_options.headless = False  # keep the browser window visible
    browser = webdriver.Chrome('./chromedriver', options=chrome_options)
    browser.implicitly_wait(10)
    browser.get(url)
    # Accumulators for all states
    county_names = []
    accs = []
    state_code = []
    try:
        for i in range(len(fips_data_states_df)):
            print(i)  # progress indicator
            browser.implicitly_wait(10)
            browser.get(url)
            # Select the state in the dropdown
            try:
                time.sleep(0.5)
                selector = fips_data_states_df['Index'][i]
                select_state = ("/html/body/form/div[9]/table/tbody/tr[3]/td[2]/select/option[%i]" % selector)
                state_dropdown = browser.find_element("xpath", select_state)
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, select_state)))
                state_dropdown.click()
                browser.implicitly_wait(10)
            except NoSuchElementException:
                print("Select_state NA")
                continue
            # Generate the report for the selected state
            time.sleep(0.5)
            generate_report = browser.find_element("xpath", "/html/body/form/div[9]/table/tbody/tr[8]/td/input[1]")
            WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "/html/body/form/div[9]/table/tbody/tr[8]/td/input[1]")))
            generate_report.click()
            browser.implicitly_wait(10)
            # Retrieve the results table
            try:
                time.sleep(10)
                table_xpath = '//*[@id="ContentPlaceHolder1_divSAS"]/div/div/div/table'
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, table_xpath)))
                table_df = pd.read_html(browser.find_element("xpath", table_xpath).get_attribute('outerHTML'))[0]
                # County names for this state
                county_rows = list(table_df['County']['County'])
                county_names.extend(county_rows)
                # Accident totals, one per county
                accs.extend(list(table_df['Totals']['Accs']))
                # Repeat this state's FIPS code once per county row
                state_code.extend([fips_data_states_df['Code'][i]] * len(county_rows))
            except TimeoutException:
                # Some states have a broken report page; skip them
                print("Table_df NA")
                continue
    finally:
        # Bug fix: always release the browser, even if an unexpected
        # exception escapes the loop (original leaked it in that case).
        browser.quit()
    # Bug fix: the original built all_data only inside the success branch,
    # raising NameError at return when every state failed. Build it here
    # so partial results are always returned.
    return [county_names, accs, state_code]
# Run the scraper and store the result
output = scrape_train_accidents(fips_data_states_df)
# Save the output to a CSV file so the scrape need not be repeated
file_name = "output.csv"
with open(file_name, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # One row per field: county names, accident totals, state codes
    writer.writerows(output)
# Build the working dataframe from the in-memory output.
# (Bug fix: the original read output.csv into output_df and then
# immediately overwrote it with pd.DataFrame(output) — the read was
# dead code; to resume from disk later, use pd.read_csv('output.csv',
# header=None) in place of the DataFrame construction below.)
output_df = pd.DataFrame(output)
# Transpose so each county is one row
output_df = output_df.T
# Name the columns
output_df.columns = ['County Names', 'Accidents', 'State Code']
# Match each scraped county with its FIPS code via a lookup table keyed on
# (state code, county name). Building the dict once replaces the original
# per-row O(n) dataframe scan inside the loop.
code_lookup = {}
for code, name, state in zip(fips_data_county_df['Code'],
                             fips_data_county_df['Name'],
                             fips_data_county_df['State Code']):
    # setdefault keeps the FIRST match, same as the original .values[0]
    code_lookup.setdefault((state, name), code)
county_code = [
    code_lookup.get((sc, str(cn)), 'NA')  # 'NA' when no county matches
    for sc, cn in zip(output_df['State Code'], output_df['County Names'])
]
# Create new column with county codes
output_df['County Code'] = county_code
# Drop rows whose county could not be matched
output_df.drop(output_df[output_df['County Code'] == 'NA'].index, inplace=True)
# Extract state abbreviations and convert to list (used for plots later)
state_abbrev = list(fips_data_states_df['State Abbrev'])
# Persist the cleaned data frame to a pickle file
output_df.to_pickle("output_df.pkl")
# Bug fix: read back the file just written ("output_df.pkl") — the
# original tried to read a nonexistent "my_data.pkl" and crashed.
df = pd.read_pickle("output_df.pkl")
# Plotting dependencies
import plotly.express as px
from urllib.request import urlopen
import json
# Accident counts arrive as strings; cast to integers for numeric plotting
output_df['Accidents'] = output_df['Accidents'].astype(int)
# Plot with values sorted in ascending order of accidents
sorted_output_df = output_df.sort_values('Accidents')
# County map template: download, cache locally, then parse the GeoJSON
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
response = requests.get(url)
with open('geojson-counties-fips.json', 'wb') as fh:
    fh.write(response.content)
with open('geojson-counties-fips.json', 'r') as fh:
    counties_geojson = json.load(fh)
# State map template: same download-cache-parse steps
url = 'https://gist.githubusercontent.com/wavded/1250983/raw/bf7c1c08f7b1596ca10822baeb8049d7350b0a4b/fipsToState.json'
response = requests.get(url)
with open('fipsToState.json', 'wb') as fh:
    fh.write(response.content)
with open('fipsToState.json', 'r') as fh:
    states_geojson = json.load(fh)
# Group by state FIPS code and sum accidents for per-state totals
sum_by_state = output_df.groupby('State Code')['Accidents'].sum()
# Reset index to get a plain dataframe
df_sum_by_state = sum_by_state.to_frame().reset_index()
# Attach state abbreviations by mapping each state FIPS code to its
# abbreviation. (Robustness fix: the original deleted a hardcoded list of
# indices from the abbreviation list to line it up with whichever states
# happened to scrape successfully — that silently misaligns whenever the
# scrape results change. Mapping by code is order-independent.)
abbrev_by_code = dict(zip(fips_data_states_df['Code'],
                          fips_data_states_df['State Abbrev']))
df_sum_by_state['State Abbrev'] = df_sum_by_state['State Code'].map(abbrev_by_code)
# Sorted by accident count (for the state map)
sorted_sum_by_state = df_sum_by_state.sort_values('Accidents')
# Sorted by original index (for the bar chart)
sorted_sum_by_index = sorted_sum_by_state.sort_index()
# State-level choropleth of train accidents
state_map = px.choropleth(sorted_sum_by_state,
                          locations='State Abbrev',
                          color='Accidents',
                          color_continuous_scale="spectral_r",
                          hover_name='State Code',
                          locationmode='USA-states',
                          # Bug fix: the labels key must be the column name
                          # ('Accidents'); the original 'acc' key matched no
                          # column and was silently ignored
                          labels={'Accidents': 'train accidents'},
                          scope='usa')
state_map.show()
# County-level choropleth of train accidents
county_map = px.choropleth_mapbox(sorted_output_df, geojson=counties_geojson,
                                  locations='County Code', color='Accidents',
                                  color_continuous_scale="Viridis",
                                  range_color=(0, max(sorted_output_df['Accidents'])),
                                  mapbox_style="carto-positron",
                                  zoom=3, center={"lat": 37.0902, "lon": -95.7129},
                                  opacity=0.5,
                                  # same labels-key fix as above
                                  labels={'Accidents': 'train accidents'}
                                  )
county_map.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
county_map.show()
# Bar chart of train accidents by state
hist = px.bar(x=sorted_sum_by_index['State Abbrev'],
              y=sorted_sum_by_index['Accidents'],
              labels={'x': 'States', 'y': 'Train Accidents'})
hist.show()